import pandas as pd
import numpy as np
import math as math
import matplotlib.pyplot as plt
import statsmodels.api as sm
from plotnine import *
from statsmodels.formula.api import ols
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import cross_val_predict
from sklearn import datasets, svm, tree
from scipy import stats
arms_sales = pd.read_csv('TIV-Import-All-1950-2020.csv')
# Filling all the NaN values with 0.0 for later calculation
arms_sales.fillna(value = 0, inplace = True)
arms_sales_original = arms_sales
arms_sales.head()
| Country | 1950 | 1951 | 1952 | 1953 | 1954 | 1955 | 1956 | 1957 | 1958 | ... | 2012 | 2013 | 2014 | 2015 | 2016 | 2017 | 2018 | 2019 | 2020 | Total | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | Afghanistan | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 4.0 | 7.0 | 89.0 | 254.0 | ... | 523.0 | 235.0 | 288.0 | 116.0 | 174.0 | 252.0 | 349.0 | 382.0 | 227.0 | 16391.0 |
| 1 | African Union** | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 0.0 | 2.0 | 0.0 | 2.0 | 0.0 | 0.0 | 0.0 | 4.0 | 0.0 | 79.0 |
| 2 | Albania | 20.0 | 10.0 | 11.0 | 20.0 | 22.0 | 132.0 | 98.0 | 42.0 | 47.0 | ... | 15.0 | 7.0 | 7.0 | 7.0 | 0.0 | 4.0 | 2.0 | 5.0 | 0.0 | 2771.0 |
| 3 | Algeria | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 661.0 | 362.0 | 620.0 | 898.0 | 2903.0 | 1128.0 | 1284.0 | 164.0 | 549.0 | 32139.0 |
| 4 | Amal (Lebanon)* | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 60.0 |
5 rows × 73 columns
#================================================
#============= Moving Data Around ===============
#================================================
# Sorting the values by the 'Total' of the row to find the highest values in the data.
arms_sales.sort_values(by=['Total'], ascending=False, inplace=True)
# Rearranging values, bringing 'Total' to the first column in the df.
cols = arms_sales.columns.tolist()
cols = cols[-1:] + cols[:-1]
# cols
arms_sales = arms_sales[cols]
arms_sales.head()
| Total | Country | 1950 | 1951 | 1952 | 1953 | 1954 | 1955 | 1956 | 1957 | ... | 2011 | 2012 | 2013 | 2014 | 2015 | 2016 | 2017 | 2018 | 2019 | 2020 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 260 | 1973386.0 | Total | 8139.0 | 11497.0 | 16533.0 | 24027.0 | 20371.0 | 20147.0 | 24262.0 | 23556.0 | ... | 30043.0 | 27995.0 | 27174.0 | 27073.0 | 28638.0 | 31438.0 | 31762.0 | 27028.0 | 27105.0 | 22831.0 |
| 94 | 126727.0 | India | 141.0 | 277.0 | 104.0 | 430.0 | 265.0 | 350.0 | 366.0 | 980.0 | ... | 3598.0 | 4392.0 | 5381.0 | 3347.0 | 3117.0 | 3003.0 | 2909.0 | 1485.0 | 3075.0 | 2799.0 |
| 39 | 80828.0 | China | 2572.0 | 2197.0 | 3430.0 | 3144.0 | 3500.0 | 1730.0 | 2255.0 | 2126.0 | ... | 1055.0 | 1571.0 | 1289.0 | 1077.0 | 1262.0 | 1139.0 | 1338.0 | 1962.0 | 1347.0 | 811.0 |
| 193 | 70781.0 | Saudi Arabia | 0.0 | 0.0 | 7.0 | 1.0 | 2.0 | 16.0 | 10.0 | 72.0 | ... | 1222.0 | 1033.0 | 1614.0 | 2740.0 | 3377.0 | 2961.0 | 3931.0 | 3315.0 | 3419.0 | 2466.0 |
| 58 | 67745.0 | Egypt | 163.0 | 57.0 | 0.0 | 1.0 | 142.0 | 725.0 | 1065.0 | 479.0 | ... | 630.0 | 281.0 | 673.0 | 420.0 | 1436.0 | 1690.0 | 2395.0 | 1677.0 | 1046.0 | 1311.0 |
5 rows × 73 columns
# Dropping row that is the total of all values. Not useful for the analysis. This part is optional.
arms_sales = arms_sales.drop(labels=[260], axis=0)
arms_sales.head()
| Total | Country | 1950 | 1951 | 1952 | 1953 | 1954 | 1955 | 1956 | 1957 | ... | 2011 | 2012 | 2013 | 2014 | 2015 | 2016 | 2017 | 2018 | 2019 | 2020 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 94 | 126727.0 | India | 141.0 | 277.0 | 104.0 | 430.0 | 265.0 | 350.0 | 366.0 | 980.0 | ... | 3598.0 | 4392.0 | 5381.0 | 3347.0 | 3117.0 | 3003.0 | 2909.0 | 1485.0 | 3075.0 | 2799.0 |
| 39 | 80828.0 | China | 2572.0 | 2197.0 | 3430.0 | 3144.0 | 3500.0 | 1730.0 | 2255.0 | 2126.0 | ... | 1055.0 | 1571.0 | 1289.0 | 1077.0 | 1262.0 | 1139.0 | 1338.0 | 1962.0 | 1347.0 | 811.0 |
| 193 | 70781.0 | Saudi Arabia | 0.0 | 0.0 | 7.0 | 1.0 | 2.0 | 16.0 | 10.0 | 72.0 | ... | 1222.0 | 1033.0 | 1614.0 | 2740.0 | 3377.0 | 2961.0 | 3931.0 | 3315.0 | 3419.0 | 2466.0 |
| 58 | 67745.0 | Egypt | 163.0 | 57.0 | 0.0 | 1.0 | 142.0 | 725.0 | 1065.0 | 479.0 | ... | 630.0 | 281.0 | 673.0 | 420.0 | 1436.0 | 1690.0 | 2395.0 | 1677.0 | 1046.0 | 1311.0 |
| 103 | 65890.0 | Japan | 0.0 | 0.0 | 8.0 | 223.0 | 314.0 | 408.0 | 757.0 | 1059.0 | ... | 268.0 | 205.0 | 252.0 | 307.0 | 329.0 | 330.0 | 441.0 | 654.0 | 899.0 | 724.0 |
5 rows × 73 columns
# Only taking the Top 50 values from the data for later analysis and resetting index values
arms_sales = arms_sales.head(50)
arms_sales.reset_index(drop=True, inplace=True)
arms_sales
| Total | Country | 1950 | 1951 | 1952 | 1953 | 1954 | 1955 | 1956 | 1957 | ... | 2011 | 2012 | 2013 | 2014 | 2015 | 2016 | 2017 | 2018 | 2019 | 2020 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 126727.0 | India | 141.0 | 277.0 | 104.0 | 430.0 | 265.0 | 350.0 | 366.0 | 980.0 | ... | 3598.0 | 4392.0 | 5381.0 | 3347.0 | 3117.0 | 3003.0 | 2909.0 | 1485.0 | 3075.0 | 2799.0 |
| 1 | 80828.0 | China | 2572.0 | 2197.0 | 3430.0 | 3144.0 | 3500.0 | 1730.0 | 2255.0 | 2126.0 | ... | 1055.0 | 1571.0 | 1289.0 | 1077.0 | 1262.0 | 1139.0 | 1338.0 | 1962.0 | 1347.0 | 811.0 |
| 2 | 70781.0 | Saudi Arabia | 0.0 | 0.0 | 7.0 | 1.0 | 2.0 | 16.0 | 10.0 | 72.0 | ... | 1222.0 | 1033.0 | 1614.0 | 2740.0 | 3377.0 | 2961.0 | 3931.0 | 3315.0 | 3419.0 | 2466.0 |
| 3 | 67745.0 | Egypt | 163.0 | 57.0 | 0.0 | 1.0 | 142.0 | 725.0 | 1065.0 | 479.0 | ... | 630.0 | 281.0 | 673.0 | 420.0 | 1436.0 | 1690.0 | 2395.0 | 1677.0 | 1046.0 | 1311.0 |
| 4 | 65890.0 | Japan | 0.0 | 0.0 | 8.0 | 223.0 | 314.0 | 408.0 | 757.0 | 1059.0 | ... | 268.0 | 205.0 | 252.0 | 307.0 | 329.0 | 330.0 | 441.0 | 654.0 | 899.0 | 724.0 |
| 5 | 57094.0 | Turkey | 117.0 | 136.0 | 315.0 | 514.0 | 473.0 | 335.0 | 198.0 | 544.0 | ... | 780.0 | 1503.0 | 793.0 | 1524.0 | 438.0 | 328.0 | 428.0 | 487.0 | 754.0 | 86.0 |
| 6 | 56204.0 | Germany | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 85.0 | 1356.0 | 2270.0 | ... | 84.0 | 131.0 | 113.0 | 95.0 | 96.0 | 55.0 | 43.0 | 14.0 | 33.0 | 33.0 |
| 7 | 55883.0 | South Korea | 155.0 | 256.0 | 266.0 | 263.0 | 158.0 | 354.0 | 454.0 | 278.0 | ... | 1570.0 | 1078.0 | 191.0 | 723.0 | 254.0 | 1058.0 | 1052.0 | 1100.0 | 1480.0 | 1317.0 |
| 8 | 54271.0 | Iraq | 0.0 | 0.0 | 59.0 | 53.0 | 19.0 | 61.0 | 105.0 | 38.0 | ... | 574.0 | 476.0 | 353.0 | 665.0 | 1464.0 | 1789.0 | 1004.0 | 543.0 | 175.0 | 1.0 |
| 9 | 47226.0 | United States | 352.0 | 695.0 | 875.0 | 1315.0 | 1896.0 | 1892.0 | 2071.0 | 1371.0 | ... | 1012.0 | 1207.0 | 791.0 | 590.0 | 513.0 | 451.0 | 516.0 | 380.0 | 866.0 | 687.0 |
| 10 | 46629.0 | Iran | 4.0 | 6.0 | 0.0 | 1.0 | 75.0 | 110.0 | 121.0 | 53.0 | ... | 98.0 | 44.0 | 13.0 | 13.0 | 13.0 | 413.0 | 4.0 | 4.0 | 3.0 | 2.0 |
| 11 | 44779.0 | Israel | 53.0 | 92.0 | 73.0 | 112.0 | 49.0 | 117.0 | 641.0 | 133.0 | ... | 85.0 | 120.0 | 209.0 | 541.0 | 681.0 | 605.0 | 536.0 | 543.0 | 546.0 | 474.0 |
| 12 | 43252.0 | Poland | 190.0 | 575.0 | 794.0 | 1021.0 | 1999.0 | 1098.0 | 687.0 | 1080.0 | ... | 137.0 | 223.0 | 145.0 | 259.0 | 140.0 | 55.0 | 154.0 | 74.0 | 325.0 | 193.0 |
| 13 | 42706.0 | Taiwan | 0.0 | 98.0 | 304.0 | 383.0 | 320.0 | 622.0 | 256.0 | 436.0 | ... | 197.0 | 422.0 | 548.0 | 1077.0 | 629.0 | 97.0 | 493.0 | 99.0 | 64.0 | 102.0 |
| 14 | 42598.0 | Australia | 468.0 | 371.0 | 410.0 | 153.0 | 125.0 | 637.0 | 451.0 | 445.0 | ... | 1557.0 | 867.0 | 245.0 | 906.0 | 1464.0 | 1026.0 | 1653.0 | 1557.0 | 1184.0 | 1658.0 |
| 15 | 40278.0 | Greece | 97.0 | 500.0 | 307.0 | 85.0 | 152.0 | 203.0 | 92.0 | 354.0 | ... | 70.0 | 18.0 | 28.0 | 204.0 | 749.0 | 323.0 | 51.0 | 38.0 | 93.0 | 85.0 |
| 16 | 40048.0 | Syria | 106.0 | 1.0 | 20.0 | 18.0 | 9.0 | 74.0 | 106.0 | 381.0 | ... | 493.0 | 516.0 | 506.0 | 21.0 | 22.0 | 8.0 | 25.0 | 181.0 | 0.0 | 72.0 |
| 17 | 38819.0 | Pakistan | 194.0 | 169.0 | 145.0 | 95.0 | 145.0 | 195.0 | 158.0 | 529.0 | ... | 1128.0 | 1014.0 | 1095.0 | 828.0 | 779.0 | 837.0 | 837.0 | 799.0 | 521.0 | 759.0 |
| 18 | 35317.0 | United Kingdom | 105.0 | 246.0 | 532.0 | 1591.0 | 512.0 | 315.0 | 126.0 | 90.0 | ... | 366.0 | 583.0 | 494.0 | 211.0 | 397.0 | 259.0 | 1002.0 | 519.0 | 342.0 | 764.0 |
| 19 | 32406.0 | Canada | 264.0 | 294.0 | 998.0 | 1884.0 | 1123.0 | 1274.0 | 721.0 | 764.0 | ... | 336.0 | 197.0 | 202.0 | 316.0 | 408.0 | 233.0 | 331.0 | 160.0 | 174.0 | 207.0 |
| 20 | 32219.0 | Italy | 158.0 | 232.0 | 883.0 | 1309.0 | 308.0 | 587.0 | 983.0 | 773.0 | ... | 286.0 | 207.0 | 81.0 | 141.0 | 253.0 | 760.0 | 735.0 | 317.0 | 183.0 | 131.0 |
| 21 | 32139.0 | Algeria | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 1125.0 | 661.0 | 362.0 | 620.0 | 898.0 | 2903.0 | 1128.0 | 1284.0 | 164.0 | 549.0 |
| 22 | 31079.0 | Libya | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 3.0 | ... | 8.0 | 73.0 | 116.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
| 23 | 30708.0 | Netherlands | 154.0 | 454.0 | 423.0 | 975.0 | 784.0 | 736.0 | 1171.0 | 695.0 | ... | 120.0 | 254.0 | 451.0 | 9.0 | 60.0 | 75.0 | 33.0 | 143.0 | 497.0 | 610.0 |
| 24 | 30552.0 | East Germany (GDR) | 29.0 | 0.0 | 178.0 | 92.0 | 10.0 | 0.0 | 678.0 | 1122.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
| 25 | 30039.0 | Soviet Union | 36.0 | 0.0 | 0.0 | 8.0 | 70.0 | 945.0 | 1538.0 | 1500.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
| 26 | 29352.0 | Czechoslovakia | 90.0 | 247.0 | 707.0 | 1607.0 | 1003.0 | 1017.0 | 573.0 | 253.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
| 27 | 29281.0 | UAE | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 1235.0 | 1154.0 | 2306.0 | 768.0 | 1224.0 | 956.0 | 966.0 | 1196.0 | 691.0 | 432.0 |
| 28 | 26954.0 | Viet Nam | 0.0 | 0.0 | 0.0 | 0.0 | 10.0 | 19.0 | 9.0 | 54.0 | ... | 1010.0 | 780.0 | 362.0 | 1212.0 | 855.0 | 909.0 | 783.0 | 604.0 | 160.0 | 41.0 |
| 29 | 25618.0 | Spain | 1.0 | 0.0 | 0.0 | 78.0 | 88.0 | 217.0 | 273.0 | 287.0 | ... | 168.0 | 224.0 | 155.0 | 109.0 | 135.0 | 86.0 | 82.0 | 142.0 | 40.0 | 57.0 |
| 30 | 23975.0 | North Korea | 861.0 | 372.0 | 211.0 | 674.0 | 263.0 | 404.0 | 479.0 | 428.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
| 31 | 20353.0 | France | 218.0 | 988.0 | 1180.0 | 1739.0 | 1659.0 | 1578.0 | 1446.0 | 738.0 | ... | 48.0 | 90.0 | 107.0 | 17.0 | 42.0 | 45.0 | 76.0 | 183.0 | 99.0 | 145.0 |
| 32 | 19928.0 | Bulgaria | 156.0 | 63.0 | 97.0 | 349.0 | 617.0 | 307.0 | 305.0 | 514.0 | ... | 29.0 | 5.0 | 0.0 | 0.0 | 3.0 | 0.0 | 0.0 | 0.0 | 2.0 | 44.0 |
| 33 | 19538.0 | Indonesia | 195.0 | 58.0 | 14.0 | 6.0 | 16.0 | 25.0 | 42.0 | 30.0 | ... | 249.0 | 212.0 | 801.0 | 1177.0 | 438.0 | 377.0 | 1171.0 | 329.0 | 217.0 | 270.0 |
| 34 | 19163.0 | Singapore | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 6.0 | 0.0 | ... | 895.0 | 745.0 | 749.0 | 65.0 | 147.0 | 633.0 | 430.0 | 594.0 | 636.0 | 74.0 |
| 35 | 18439.0 | Belgium | 198.0 | 485.0 | 972.0 | 1098.0 | 604.0 | 361.0 | 682.0 | 680.0 | ... | 21.0 | 32.0 | 46.0 | 104.0 | 56.0 | 4.0 | 7.0 | 1.0 | 9.0 | 86.0 |
| 36 | 18170.0 | Romania | 4.0 | 76.0 | 454.0 | 904.0 | 629.0 | 359.0 | 448.0 | 186.0 | ... | 80.0 | 21.0 | 20.0 | 0.0 | 22.0 | 177.0 | 88.0 | 37.0 | 13.0 | 187.0 |
| 37 | 17476.0 | Yugoslavia | 0.0 | 182.0 | 490.0 | 525.0 | 534.0 | 334.0 | 352.0 | 229.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
| 38 | 17094.0 | Brazil | 0.0 | 615.0 | 0.0 | 133.0 | 112.0 | 379.0 | 55.0 | 152.0 | ... | 306.0 | 327.0 | 223.0 | 209.0 | 219.0 | 109.0 | 96.0 | 269.0 | 168.0 | 158.0 |
| 39 | 17026.0 | Norway | 113.0 | 127.0 | 426.0 | 421.0 | 19.0 | 168.0 | 282.0 | 142.0 | ... | 606.0 | 143.0 | 51.0 | 16.0 | 143.0 | 140.0 | 352.0 | 532.0 | 379.0 | 450.0 |
| 40 | 16391.0 | Afghanistan | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 4.0 | 7.0 | 89.0 | ... | 661.0 | 523.0 | 235.0 | 288.0 | 116.0 | 174.0 | 252.0 | 349.0 | 382.0 | 227.0 |
| 41 | 14798.0 | Thailand | 62.0 | 88.0 | 61.0 | 44.0 | 60.0 | 34.0 | 10.0 | 88.0 | ... | 268.0 | 278.0 | 357.0 | 95.0 | 174.0 | 336.0 | 302.0 | 627.0 | 295.0 | 132.0 |
| 42 | 14173.0 | Hungary | 56.0 | 53.0 | 13.0 | 802.0 | 699.0 | 222.0 | 1.0 | 59.0 | ... | 9.0 | 0.0 | 2.0 | 7.0 | 0.0 | 14.0 | 6.0 | 27.0 | 16.0 | 67.0 |
| 43 | 13128.0 | Venezuela | 61.0 | 0.0 | 25.0 | 195.0 | 135.0 | 66.0 | 406.0 | 285.0 | ... | 596.0 | 687.0 | 993.0 | 181.0 | 167.0 | 137.0 | 0.0 | 0.0 | 0.0 | 51.0 |
| 44 | 13017.0 | Peru | 2.0 | 24.0 | 76.0 | 0.0 | 111.0 | 35.0 | 175.0 | 117.0 | ... | 63.0 | 10.0 | 22.0 | 152.0 | 212.0 | 223.0 | 65.0 | 94.0 | 0.0 | 19.0 |
| 45 | 12414.0 | Sweden | 225.0 | 297.0 | 316.0 | 431.0 | 319.0 | 433.0 | 555.0 | 253.0 | ... | 191.0 | 207.0 | 56.0 | 45.0 | 14.0 | 70.0 | 46.0 | 7.0 | 7.0 | 3.0 |
| 46 | 12279.0 | Switzerland | 154.0 | 151.0 | 321.0 | 169.0 | 142.0 | 230.0 | 313.0 | 223.0 | ... | 32.0 | 10.0 | 6.0 | 5.0 | 12.0 | 73.0 | 34.0 | 10.0 | 13.0 | 10.0 |
| 47 | 12088.0 | Argentina | 75.0 | 445.0 | 38.0 | 4.0 | 5.0 | 0.0 | 39.0 | 14.0 | ... | 15.0 | 3.0 | 12.0 | 7.0 | 21.0 | 17.0 | 47.0 | 21.0 | 48.0 | 7.0 |
| 48 | 11897.0 | Jordan | 0.0 | 0.0 | 0.0 | 0.0 | 27.0 | 34.0 | 62.0 | 0.0 | ... | 215.0 | 176.0 | 75.0 | 197.0 | 233.0 | 232.0 | 389.0 | 238.0 | 224.0 | 148.0 |
| 49 | 11893.0 | Cuba | 0.0 | 8.0 | 0.0 | 0.0 | 3.0 | 13.0 | 35.0 | 8.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
50 rows × 73 columns
#================================================
#============= Restructuring Data ===============
#================================================
# Reshaping the data just like project 3 for easier calculations and familiarity.
# Putting all years and countries into separate lists
years = arms_sales.iloc[1:4, 2:].columns.tolist()
countries = arms_sales['Country'].tolist()
# Initializing empty arrays to append data
country_list = []
year_list = []
import_list = []
totals_list = []
year_mean_list = []
# Double for-loop through all countries and years
for country in countries:
for year in years:
# Appending values
country_list.append(country)
year_list.append(year)
# Index of row where Country = Country (ex: Counter == 'China') which will be used to locate value.
country_index = arms_sales.index[arms_sales['Country'] == country][0]
import_value = arms_sales.at[country_index, year]
total_value = arms_sales.at[country_index, 'Total']
year_mean = arms_sales[year].mean()
# Appending retrieved values and calculations
import_list.append(import_value)
totals_list.append(total_value)
year_mean_list.append(year_mean)
# N_O_T_E: Entire average over top 50 can be accessed through this quick calculation:
total_mean_over_top_50 = arms_sales['Total'].mean()
# Structuring data
data = {
'country': country_list,
'year': year_list,
'imports': import_list,
'year_avg_imports': year_mean_list,
'total_imports_for_country': totals_list
}
# Placing data into pandas dataframe. N_O_T_E: Remember you can always access the original because it still exists (i.e. arms_sales). There is also the most original reference stored in arms_sales_original, which has all the intial data in addition to the top 50.
small_arms_df = pd.DataFrame(data)
small_arms_df
| country | year | imports | year_avg_imports | total_imports_for_country | |
|---|---|---|---|---|---|
| 0 | India | 1950 | 141.0 | 156.58 | 126727.0 |
| 1 | India | 1951 | 277.0 | 218.70 | 126727.0 |
| 2 | India | 1952 | 104.0 | 310.64 | 126727.0 |
| 3 | India | 1953 | 430.0 | 457.00 | 126727.0 |
| 4 | India | 1954 | 265.0 | 390.10 | 126727.0 |
| ... | ... | ... | ... | ... | ... |
| 3545 | Cuba | 2016 | 0.0 | 502.20 | 11893.0 |
| 3546 | Cuba | 2017 | 0.0 | 524.62 | 11893.0 |
| 3547 | Cuba | 2018 | 0.0 | 451.90 | 11893.0 |
| 3548 | Cuba | 2019 | 0.0 | 411.78 | 11893.0 |
| 3549 | Cuba | 2020 | 0.0 | 368.98 | 11893.0 |
3550 rows × 5 columns
#================================================
#============ Plots using all data ==============
#================================================
small_arms_df['year'] = small_arms_df['year'].astype(int)
x = small_arms_df['year'].values
y = small_arms_df['year_avg_imports'].values
plt.scatter(x, y)
plt.xlabel("Year")
plt.ylabel("Average Imports")
Text(0, 0.5, 'Average Imports')
ggplot(aes(x='year', y='year_avg_imports'), data=small_arms_df) + stat_smooth(method="lm") + geom_point(size=2) +\
labs(title="All Imports 1950-2020",
x = "Years",
y = "Imports")
<ggplot: (194654591922)>
#================================================
#==== Filtering data for later graphing =========
#================================================
# All filtered data to use in individual graphs later.
small_arms_df['year'] = small_arms_df['year'].astype(int)
filt_1950_1960 = (small_arms_df['year'] >= 1950) & (small_arms_df['year'] <= 1960)
filt_1960_1970 = (small_arms_df['year'] >= 1960) & (small_arms_df['year'] <= 1970)
filt_1970_1980 = (small_arms_df['year'] >= 1970) & (small_arms_df['year'] <= 1980)
filt_1980_1990 = (small_arms_df['year'] >= 1980) & (small_arms_df['year'] <= 1990)
filt_1990_2000 = (small_arms_df['year'] >= 1990) & (small_arms_df['year'] <= 2000)
filt_2000_2010 = (small_arms_df['year'] >= 2000) & (small_arms_df['year'] <= 2010)
filt_2010_2020 = (small_arms_df['year'] >= 2010) & (small_arms_df['year'] <= 2020)
#================================================
#==== Fitting a linear regression model =========
#================================================
# N_O_T_E: Same calculations from project3. I used the year_average as the dependant variable here, I was likely wrong, so please let me know your opinion. Honestly not good at this. The only relationship I know these variables have is that the number of imports will increase depending on the year (wartime). Don't we need some kind of expected values for fitting a model?
small_arms_df['year'] = small_arms_df['year'].astype(int)
years = np.array(small_arms_df['year'])
imports = np.array(small_arms_df['imports'])
LR = LinearRegression()
LR.fit(years.reshape(-1,1), imports)
prediction = LR.predict(years.reshape(-1,1))
small_arms_df['importsPredict'] = pd.Series(prediction, index=small_arms_df.index)
small_arms_df['residual'] = small_arms_df['imports'] - small_arms_df['importsPredict']
RM = ols(formula='year ~ year_avg_imports', data=small_arms_df).fit()
RM.summary()
| Dep. Variable: | year | R-squared: | 0.000 |
|---|---|---|---|
| Model: | OLS | Adj. R-squared: | -0.000 |
| Method: | Least Squares | F-statistic: | 0.2967 |
| Date: | Sun, 09 May 2021 | Prob (F-statistic): | 0.586 |
| Time: | 09:11:41 | Log-Likelihood: | -15759. |
| No. Observations: | 3550 | AIC: | 3.152e+04 |
| Df Residuals: | 3548 | BIC: | 3.153e+04 |
| Df Model: | 1 | ||
| Covariance Type: | nonrobust |
| coef | std err | t | P>|t| | [0.025 | 0.975] | |
|---|---|---|---|---|---|---|
| Intercept | 1985.6562 | 1.253 | 1584.758 | 0.000 | 1983.200 | 1988.113 |
| year_avg_imports | -0.0014 | 0.003 | -0.545 | 0.586 | -0.006 | 0.004 |
| Omnibus: | 2691.101 | Durbin-Watson: | 0.165 |
|---|---|---|---|
| Prob(Omnibus): | 0.000 | Jarque-Bera (JB): | 209.039 |
| Skew: | -0.015 | Prob(JB): | 4.05e-46 |
| Kurtosis: | 1.812 | Cond. No. | 1.83e+03 |
#================================================
#====== Creating Bins for slicing years =========
#================================================
# Approach like from project2, cutting the data up and displaying from there.
bins = [1949, 1960, 1970, 1980, 1990, 2000, 2010, 2020]
groups = ["1950-1960", "1960-1970", "1970-1980", "1980-1990", "1990-2000", "2000-2010", "2010-2020"]
# Created new column that depicts which range in years a row falls into.
small_arms_df['cut_year'] = pd.cut(small_arms_df.year, bins, labels=groups)
small_arms_df
| country | year | imports | year_avg_imports | total_imports_for_country | importsPredict | residual | cut_year | |
|---|---|---|---|---|---|---|---|---|
| 0 | India | 1950 | 141.0 | 156.58 | 126727.0 | 484.059343 | -343.059343 | 1950-1960 |
| 1 | India | 1951 | 277.0 | 218.70 | 126727.0 | 483.997945 | -206.997945 | 1950-1960 |
| 2 | India | 1952 | 104.0 | 310.64 | 126727.0 | 483.936547 | -379.936547 | 1950-1960 |
| 3 | India | 1953 | 430.0 | 457.00 | 126727.0 | 483.875150 | -53.875150 | 1950-1960 |
| 4 | India | 1954 | 265.0 | 390.10 | 126727.0 | 483.813752 | -218.813752 | 1950-1960 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 3545 | Cuba | 2016 | 0.0 | 502.20 | 11893.0 | 480.007093 | -480.007093 | 2010-2020 |
| 3546 | Cuba | 2017 | 0.0 | 524.62 | 11893.0 | 479.945696 | -479.945696 | 2010-2020 |
| 3547 | Cuba | 2018 | 0.0 | 451.90 | 11893.0 | 479.884298 | -479.884298 | 2010-2020 |
| 3548 | Cuba | 2019 | 0.0 | 411.78 | 11893.0 | 479.822900 | -479.822900 | 2010-2020 |
| 3549 | Cuba | 2020 | 0.0 | 368.98 | 11893.0 | 479.761502 | -479.761502 | 2010-2020 |
3550 rows × 8 columns
# Wrong af, but just doing it anyway.
small_arms_df['year'] = small_arms_df['year'].astype(str)
ggplot(aes(x='cut_year', y='residual'), data=small_arms_df) + geom_violin() + labs(title="Residuals vs. Year", x="Years", y="Residuals")
<ggplot: (194654591835)>
#================================================
#=============== Multi graphs ===================
#================================================
# Imports in 10 year time periods
small_arms_df['year'] = small_arms_df['year'].astype(int)
ggplot(aes(x='year', y='year_avg_imports'), data=small_arms_df) + \
geom_point() + \
geom_smooth(method='lm') + \
facet_grid("cut_year ~ .") +\
labs(title="Increments by 10 years",
x = "Years",
y = "Imports")
<ggplot: (194655931766)>
# Just another version of the graph above.
ggplot(aes(x='year', y='year_avg_imports', color='cut_year'), data=small_arms_df) + stat_smooth(method="lm") + geom_point()
<ggplot: (194655931721)>
#================================================
#=============== Individual graphs ==============
#================================================
small_arms_df['year'] = small_arms_df['year'].astype(int)
ggplot(aes(x='year', y='year_avg_imports'), data=small_arms_df.loc[filt_1950_1960]) + stat_smooth(method="lm") + geom_point(size=1) +\
labs(title="All Imports 1950-1960",
x = "Years",
y = "Imports on Average")
<ggplot: (194655911159)>
ggplot(aes(x='year', y='year_avg_imports'), data=small_arms_df.loc[filt_1960_1970]) + stat_smooth(method="lm") + geom_point(size=1) +\
labs(title="All Imports 1960-1970",
x = "Years",
y = "Imports on Average")
<ggplot: (194656354850)>
ggplot(aes(x='year', y='year_avg_imports'), data=small_arms_df.loc[filt_1970_1980]) + stat_smooth(method="lm") + geom_point(size=1) +\
labs(title="All Imports 1970-1980",
x = "Years",
y = "Imports on Average")
<ggplot: (194655936988)>
ggplot(aes(x='year', y='year_avg_imports'), data=small_arms_df.loc[filt_1980_1990]) + stat_smooth(method="lm") + geom_point(size=1) +\
labs(title="All Imports 1980-1990",
x = "Years",
y = "Imports on Average")
<ggplot: (194656338487)>
ggplot(aes(x='year', y='year_avg_imports'), data=small_arms_df.loc[filt_1990_2000]) + stat_smooth(method="lm") + geom_point(size=1) +\
labs(title="All Imports 1990-2000",
x = "Years",
y = "Imports on Average")
<ggplot: (194656124564)>
ggplot(aes(x='year', y='year_avg_imports'), data=small_arms_df.loc[filt_2000_2010]) + stat_smooth(method="lm") + geom_point(size=1) +\
labs(title="All Imports 2000-2010",
x = "Years",
y = "Imports on Average")
<ggplot: (194656217242)>
ggplot(aes(x='year', y='year_avg_imports'), data=small_arms_df.loc[filt_2010_2020]) + stat_smooth(method="lm") + geom_point(size=1) +\
labs(title="All Imports 2010-2020",
x = "Years",
y = "Imports on Average")
<ggplot: (194656093826)>